from IPython.display import Image
Image(url="http://trackmaven.com/assets/tm_logo_horizontal.png")
import requests
response = requests.get("http://isitlunchtimeyet.com/", auth=('fheisler', 'passw0rd'))
response
<Response [200]>
print response.text
<html> <head> <script type="text/javascript" src="isit.js"></script> <link rel="stylesheet" type="text/css" href="s.css" /> <title>Is it lunch time yet?</title> </head> <body> <div>by <a href="http://www.butterfat.net/">Butterfat, LLC</a></div> </body> </html>
import requests
class RateLimitError(Exception):
pass
class ClientError(Exception):
pass
class APIClient(object):
"""
A generic API client to handle interaction using the
requests module; throw specific errors based on various responses
"""
# Should be overwritten by implementations, i.e. HTTPError=FacebookHTTPError
HTTPError = requests.exceptions.HTTPError
ConnectionError = requests.exceptions.ConnectionError
Error = ClientError
RateLimitError = RateLimitError
def __init__(self):
""" Throw exceptions if BASE_URI or BASE_PARAMS are not set """
if not hasattr(self, 'BASE_URI'):
raise NotImplementedError('Must specify a base uri')
if not hasattr(self, 'BASE_PARAMS'):
raise NotImplementedError('Must specify base params')
def _validate_response(self, response):
""" Optional response validation """
pass
def _validate_error(self, response):
"""
Optional error validation, for when you want to raise a specific
exception, for example rate limit exceptions.
"""
pass
def _get(self, url, params={}, timeout=10, retries=3):
"""
Gets a response based on the url and params passed to it;
will retry 3 times if there is a connection error
"""
full_url = '{}/{}'.format(self.BASE_URI, url)
while retries > 0:
try:
if hasattr(self, 'BASE_PARAMS'):
params.update(self.BASE_PARAMS)
response = requests.get(full_url, params=params)
try:
response.raise_for_status()
self._validate_response(response)
return response
except requests.exceptions.HTTPError as e:
self._validate_error(response)
raise self.HTTPError(e)
except (requests.exceptions.ConnectionError,
requests.exceptions.Timeout) as e:
retries -= 1
if not retries:
raise self.ConnectionError(e)
def _json(self, response):
""" Try to convert the response to JSON """
try:
return response.json()
except Exception as e:
raise self.Error(e)
import pandas as pd
import matplotlib.pyplot as plt
!pwd
!ls
/Users/fheisler/pyohio PyOhio - Social media data analysis.ipynb instagram.db blog.csv requirements.txt
import sqlite3
# Import Instagram picture data from SQL table
connection = sqlite3.connect('instagram.db')
instagram_data = pd.io.sql.read_sql("SELECT * FROM instagram;", con=connection)
instagram_data.head()
id | account | filter | likes | comments | caption | timestamp | |
---|---|---|---|---|---|---|---|
0 | 31044 | murphyoilusa | Valencia | 7 | 0 | Elvis is in the building at Murphy Express 855... | 140109 11:17 |
1 | 32057 | murphyoilusa | Normal | 5 | 0 | Cool tanker! #murphyusa | 140129 11:40 |
2 | 6359 | murphyoilusa | Rise | 6 | 0 | Looking up! #murphyusa | 121205 12:11 |
3 | 6347 | murphyoilusa | Hefe | 4 | 0 | Quick Strike Energy! #murphyusa | 121205 12:23 |
4 | 6351 | murphyoilusa | X-Pro II | 4 | 0 | Look out, it's Super Murph! #murphyusa | 121205 12:16 |
instagram_data.dtypes
id int64 account object filter object likes int64 comments int64 caption object timestamp object dtype: object
instagram_data.describe()
id | likes | comments | |
---|---|---|---|
count | 34575.000000 | 34575.000000 | 34575.000000 |
mean | 21677.659205 | 3690.893304 | 50.993955 |
std | 10841.855251 | 13330.935345 | 218.687328 |
min | 3451.000000 | 0.000000 | 0.000000 |
25% | 12272.500000 | 31.000000 | 0.000000 |
50% | 20929.000000 | 208.000000 | 4.000000 |
75% | 31439.500000 | 1297.000000 | 21.000000 |
max | 40191.000000 | 260107.000000 | 12567.000000 |
instagram_data['social_actions'] = instagram_data.likes + instagram_data.comments
instagram_data['social_actions'].median()
215.0
instagram_data.social_actions.hist(bins=50);
instagram_data[instagram_data.social_actions > 50000].social_actions.hist(bins=50)
plt.title("Distribution of social actions (50k+)")
plt.ylabel("Total likes + comments")
plt.xlabel("Number of Instagram pictures");
# Top 10 filters by usage
top_filters = instagram_data.groupby('filter').size().order(ascending=False)
(100 * top_filters / float(sum(top_filters)))[:10]
filter Normal 58.522054 Lo-fi 5.512654 Amaro 5.055676 Mayfair 4.937093 Valencia 4.827187 X-Pro II 4.043384 Rise 3.704989 Hudson 3.271150 Hefe 2.111352 Sierra 1.512654 dtype: float64
# Top 10 filters by social engagement
social_actions = instagram_data.groupby('filter').social_actions
(social_actions.sum() / social_actions.count()).order(ascending=False)[:10]
filter Normal 5285.500198 Willow 3827.500000 Sierra 2721.579350 Valencia 2259.502097 Amaro 2176.354119 Hudson 1932.223696 Mayfair 1900.403632 1977 1476.029851 Rise 1305.064793 Lo-fi 1174.589192 Name: social_actions, dtype: float64
# Top pics using Willow and Sierra filter
filter_top_filters = instagram_data['filter'].isin(["Willow", "Sierra"])
instagram_data[filter_top_filters].sort('social_actions', ascending=False)[:10]
id | account | filter | likes | comments | caption | timestamp | social_actions | |
---|---|---|---|---|---|---|---|---|
10658 | 31537 | nike | Sierra | 177037 | 3242 | Finish the season on your terms. #justdoit | 140119 22:18 | 180279 |
11182 | 26766 | nike | Sierra | 106291 | 567 | There's only one receiver who's always open.@u... | 131027 18:29 | 106858 |
11180 | 26705 | nike | Sierra | 103974 | 370 | No gym. No trainer. No excuse. #justdoit@nikew... | 131025 19:11 | 104344 |
17631 | 30798 | starbucks | Sierra | 92508 | 844 | Sometimes you just need to stay in. #Cozy #Cof... | 140103 13:08 | 93352 |
11175 | 22369 | nike | Sierra | 90401 | 408 | Make your next run your best run.The @nikerunn... | 130912 12:59 | 90809 |
17274 | 28780 | starbucks | Sierra | 88863 | 312 | A little piece of home at 37,000 ft.#starbucks... | 131127 14:27 | 89175 |
11220 | 7477 | nike | Sierra | 85967 | 536 | A hist�ria n�o se escreve sozinha.History does... | 130625 12:04 | 86503 |
17248 | 24237 | starbucks | Sierra | 74336 | 310 | Follow your #heart. Celebrate what you #love. ... | 130930 14:52 | 74646 |
17558 | 25366 | starbucks | Willow | 73481 | 273 | Let weather be your excuse to #slowdown and #s... | 131017 16:07 | 73754 |
17237 | 29043 | starbucks | Willow | 67709 | 226 | Perfect cup. #PourOver #Coffee #Love | 131204 13:31 | 67935 |
# Most engaging Instagram posters
posters = instagram_data.groupby('account').social_actions
top_posters = (posters.sum() / posters.count()).order(ascending=False)
top_posters[:5]
account starbucks 47178.250000 nike 41012.822034 disney 20712.481390 footlocker 20349.083185 apple 14354.261905 Name: social_actions, dtype: float64
# Average number of hashtags used by top brands
instagram_data['num_hashtags'] = instagram_data.caption.str.count("#")
tag_counts = instagram_data.groupby('account').num_hashtags
avg_tags = (tag_counts.sum() / tag_counts.count())
pd.concat([top_posters, avg_tags], axis=1).sort('social_actions', ascending=False).num_hashtags[:5]
starbucks 1.774554 nike 1.241525 disney 0.620347 footlocker 1.899584 apple 0.619048 Name: num_hashtags, dtype: float64
# Plot most effective number of hashtags
tag_effect = instagram_data.groupby('num_hashtags').social_actions
plt.plot((tag_effect.sum() / tag_effect.count())[:5]);
plt.ylabel("Average number of likes + comments");
plt.xlabel("Number of hashtags used");
# Biggest hashtags users
avg_tags.order(ascending=False)[:5]
account ross_stores 29.978723 goavisbudget 18.104478 comcast_xfinity 12.000000 gamestop_worldwide 9.115385 costco_wholesale 7.326531 Name: num_hashtags, dtype: float64
# Find the best day of week to post
import arrow
# Create an Arrow timestamp
instagram_data['day'] = instagram_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))
# Format to day of week, 1 through 7
instagram_data.day = instagram_data.day.apply(format, args=("d",))
dow_effect = instagram_data.groupby('day').social_actions
(dow_effect.sum() / dow_effect.count()).plot(kind='bar');
plt.ylabel("Avg likes + comments");
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));
import nltk
# Import blog data from CSV file
blog_data = pd.read_csv("blog.csv")
blog_data.head()
id | title | summary | timestamp | fb_likes | fb_shares | linkedin_shares | pins | tweets | |
---|---|---|---|---|---|---|---|---|---|
0 | 6902 | 15 Things You Probably Didn't Know About 'Good... | <p>“YOU LIKE APPLES?”</p> ... | 131121 18:45 | 2910 | 1353 | 1 | 30 | 256 |
1 | 10924 | New Footage Of Baby Lil Bub Is Just As Magical... | <p>Be still my heart.</p> ... | 140117 12:30 | 408 | 431 | 0 | 55 | 92 |
2 | 10922 | 15 Iconic People You Had No Idea Were The Same... | <p>MIND. BLOWN.</p> ... | 140117 10:00 | 1114 | 833 | 4 | 28 | 237 |
3 | 10921 | Activist Gives Speech Inside Mitch McConnell's... | <p>“So I’m asking you the million ... | 140117 11:00 | 1589 | 590 | 2 | 1 | 202 |
4 | 10545 | The Definitive 2014 Golden Globes Eyewear Ranking | <p>Matt Damon was “the garbage man who d... | 140113 19:30 | 259 | 184 | 7 | 11 | 111 |
# Clean up HTML tags (not quite safe!)
import re
blog_data.summary = blog_data.summary.apply(lambda s: re.sub("<[^<]+?>", "", s))
# Remove multiple spaces
blog_data.summary = blog_data.summary.apply(lambda s: re.sub(' +', ' ', s))
# Remove leading and trailing spaces
blog_data.title = blog_data.title.apply(lambda t: t.strip())
blog_data.summary = blog_data.summary.apply(lambda s: s.strip())
# Decode HTML entities
from lxml import html
blog_data.summary = blog_data.summary.apply(lambda s: html.fromstring(s).text)
# Collapse social shares
blog_data['shares'] = blog_data.fb_likes + blog_data.fb_shares + blog_data.linkedin_shares + blog_data.pins + blog_data.tweets
blog_data = blog_data.drop(['fb_likes', 'fb_shares', 'linkedin_shares', 'pins', 'tweets'], axis=1);
blog_data.shares.describe()
count 25188.000000 mean 9532.819557 std 51112.025280 min 12.000000 25% 391.000000 50% 1194.000000 75% 4251.000000 max 3349344.000000 dtype: float64
# Top performing blog post
blog_data[blog_data.shares == blog_data.shares.max()]
id | title | summary | timestamp | shares | |
---|---|---|---|---|---|
5354 | 12008 | What Career Should You Actually Have? | Do what you love, love what you do. | 140130 23:30 | 3349344 |
title_word_bag = blog_data.title.apply(lambda t: t + " ").sum()
# Top 10 most common words
from collections import Counter
Counter(title_word_bag.split()).most_common()[:10]
[('The', 9423), ('To', 5390), ('Of', 5261), ('A', 5070), ('You', 4343), ('In', 3528), ('Is', 3206), ('This', 2638), ('And', 2555), ('That', 2363)]
# Top 10 most common non-stopwords
stopwords = [unicode(word) for word in nltk.corpus.stopwords.words('english')]
title_words = [word for word in title_word_bag.split() if word.lower() not in stopwords]
Counter(title_words).most_common()[:10]
-c:3: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
[('Things', 1543), ('New', 969), ('People', 879), ('Reasons', 833), ('Make', 810), ('21', 736), ('Best', 710), ('Like', 691), ('Know', 654), ('Ever', 632)]
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(title_words)
# Filter to only bigrams that appear 20+ times
bigram_finder.apply_freq_filter(20)
bigram_finder.score_ngrams(bigram_measures.raw_freq)[:10]
[(('World', 'Cup'), 0.0010484454085321765), (('Signs', "You're"), 0.0008797300554350446), (("Didn't", 'Know'), 0.0008737045071101471), (('Daily', 'Links'), 0.000777295733911786), (('Look', 'Like'), 0.000777295733911786), (('New', 'York'), 0.0007712701855868884), (('Definitive', 'Ranking'), 0.0007531935406121957), (('Looks', 'Like'), 0.0007170402506628103), (('Miley', 'Cyrus'), 0.0006748614123885273), (("'Game", "Thrones'"), 0.0006567847674138346)]
# Top 10 bigrams with the highest PMI (pointwise mutual information)
bigram_finder.nbest(bigram_measures.pmi, 10)
[('Mariah', 'Carey'), ("'Let", "Go'"), ("'Wrecking", "Ball'"), ('Leonardo', 'DiCaprio'), ('Zac', 'Efron'), ('Amy', 'Poehler'), ('Jay', 'Z'), ('Fault', "Stars'"), ('Los', 'Angeles'), ('Fab', 'Drab')]
# Examine "top list"-type posts; titles begin with a number
blog_data['list_post'] = blog_data.title.apply(lambda t: t[0].isdigit())
# How many posts are "top list" posts?
float(blog_data.list_post.sum())/blog_data.list_post.count()
0.37156582499602986
# How effective are list-type posts?
list_effect = blog_data.groupby('list_post').shares
(list_effect.sum() / list_effect.count()).plot(kind='bar');
plt.ylabel("Avg social shares");
# Examine "question"-type posts; title ends with a "?"
blog_data['question_post'] = blog_data.title.apply(lambda t: t[-1] is "?")
# How many posts are "question" posts?
float(blog_data.question_post.sum()) / blog_data.question_post.count()
0.06229156741305383
# How effective are question posts?
question_effect = blog_data.groupby('question_post').shares
(question_effect.sum() / question_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");
# Do you want to find out what these amazing questions are?
blog_data[blog_data.question_post].sort('shares', ascending=False).title[:10]
5354 What Career Should You Actually Have? 5174 What Kind Of Dog Are You? 11253 What State Do You Actually Belong In? 4220 Which Decade Do You Actually Belong In? 10968 What City Should You Actually Live In? 12827 Which Mythical Creature Are You? 9128 QUIZ: What Food Matches Your Personality? 12352 What Actress Would Play You In The Movie Versi... 3641 Which European Country Do You Actually Belong In? 4950 What Should Your College Major Actually Be? Name: title, dtype: object
blog_data['actually'] = blog_data.title.apply(lambda t: "actually" in t.lower())
# How many posts are "actually" posts?
float(blog_data.actually.sum()) / blog_data.actually.count()
0.021200571700809908
# But actually, how effective are they?
actual_effect = blog_data.groupby('actually').shares
(actual_effect.sum() / actual_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");
# Examine post title length
blog_data['title_length'] = blog_data.title.apply(lambda t: len(t))
# Distribution of title lengths
blog_data.title_length.hist();
plt.ylabel("Number of posts");
plt.xlabel("Number of characters in title");
# Most effective title lengths
title_len_effect = blog_data.groupby('title_length').shares
plt.plot((title_len_effect.sum() / title_len_effect.count()));
plt.ylabel("Average shares");
plt.xlabel("Number of characters in title");
# Examine post summary length
blog_data['summary_length'] = blog_data.summary.apply(lambda t: len(t))
# Distribution of summary lengths
blog_data.summary_length.hist(bins=50);
plt.xlabel("Number of characters in summary")
plt.ylabel("Number of posts");
# Highly skewed distribution; save a log-transformed summary length for later
blog_data['summary_log_len'] = blog_data.summary_length.apply(log)
blog_data.summary_log_len.hist();
# Bin summary lengths
bins = range(0, 3000, 100)
blog_data['binned_summary_length'] = pd.cut(blog_data.summary_length, bins=bins, labels=bins[1:])
# Most effective post summary lengths
summary_len_effect = blog_data.groupby('binned_summary_length').shares
plt.plot(bins[1:], summary_len_effect.sum() / summary_len_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Number of characters in summary");
import arrow
# Convert timestamp to arrow object for manipulation
blog_data['timestamp'] = blog_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))
# Day of week distribution
blog_data['dow'] = blog_data.timestamp.apply(lambda ts: int(ts.format('d')))
blog_data.dow.hist(bins=7, range=(0,8));
plt.ylabel("Total number of posts")
plt.xticks(range(8), ("", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));
# Day of week effectiveness
dow_effect = blog_data.groupby('dow').shares
(dow_effect.sum() / dow_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares")
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));
# Time of day distribution (hourly)
blog_data['tod'] = blog_data.timestamp.apply(lambda ts: int(ts.format('HH')))
blog_data.tod.hist(bins=24, range=(0,24));
plt.ylabel("Avg social shares");
plt.xlabel("Hour of day (ET)");
# Time of day effectiveness
tod_effect = blog_data.groupby('tod').shares
(tod_effect.sum() / tod_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");
# Prepare readability scores based on Flesch-Kincaid Grade Level
from re import match
# Load Carnegie Mellon Pronouncing Dictionary
cmu = nltk.corpus.cmudict.dict()
def reduce_word(word):
return ''.join([x for x in word.lower() if match(r'\w', x)])
def get_syllable_count(word):
word = reduce_word(word)
if (not len(word)) or (not word in cmu):
return 0
return len([x for x in list(''.join(list(cmu[word])[-1])) if match(r'\d', x)])
def get_grade_level(text):
"""Flesch-Kincaid Grade Level formula"""
sentences = nltk.tokenize.sent_tokenize(text)
sentence_count = len(sentences)
word_count = 0
syllable_count = 0
for sentence in sentences:
words = nltk.tokenize.word_tokenize(sentence)
words = [reduce_word(word) for word in words]
words = [word for word in words if word != '']
word_count += len(words)
syllable_count += sum([get_syllable_count(word) for word in words])
if word_count is 0:
return 0
word_count = float(word_count)
return (0.39 * (word_count / sentence_count)
+ 11.8 * (syllable_count / word_count)
- 15.59)
blog_data['grade_level'] = blog_data.summary.apply(get_grade_level)
# Distribution of summary grade-level scores
blog_data.grade_level.hist(bins=30, range=(-10,20));
blog_data.grade_level.describe()
count 25188.000000 mean 5.098678 std 3.986889 min -15.200000 25% 2.589286 50% 4.823333 75% 7.430000 max 53.571852 dtype: float64
bins = range(-10, 20, 5)
blog_data['binned_grade_level'] = pd.cut(blog_data.grade_level, bins=bins, labels=bins[1:])
grade_lvl_effect = blog_data.groupby('binned_grade_level').shares
plt.plot(bins[1:], grade_lvl_effect.sum() / grade_lvl_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Flesch-Kinkaid grade level");
# What are these negative scores?
blog_data.sort(['grade_level', 'shares'], ascending=[True, False])[['title', 'summary', 'grade_level', 'shares']][:10]
title | summary | grade_level | shares | |
---|---|---|---|---|
13977 | Hodor? | Hodor? | -15.200 | 18962 |
17542 | How Misandrist Are You? | #BanMen | -15.200 | 17415 |
18336 | How Many F#@ks Do You Give? | DYGAF? | -15.200 | 6309 |
13461 | How Persian Are You? | Vuyyyyyy. | -15.200 | 5131 |
21488 | Which Celebrity Cat Are You? | meeeeeeeeeoooooow. | -15.200 | 4000 |
15144 | Should You Get Out Of Bed Today? | Hmmmmmmmmmmmmmmmmmm. | -15.200 | 3815 |
6343 | How Much Do You Hate Small Talk? | Soooo……… | -15.200 | 2256 |
349 | FYI, Lady Gaga And Christina Aguilera Have Rel... | SLAAAAAY. youtube.com | -15.200 | 510 |
4779 | Obama's Super Bowl Interview With Bill O'Reill... | JICYMI . | -15.200 | 192 |
17768 | When Rihanna Met Aaron Paul | #Pinkman4President. Twitter: @rihanna | -15.005 | 126 |
# Randomly shuffle rows
blog_data = blog_data.apply(np.random.permutation)
# Use these columns as features
feature_list = ['list_post',
'question_post',
'actually',
'title_length',
'summary_log_len',
'dow',
'tod',
'grade_level',
]
# Prepare only the columns we need (features + target)
reduced_blog_data = blog_data[feature_list + ['shares']]
# Check for NaN's
reduced_blog_data.isnull().any()
list_post False question_post False actually False title_length False summary_log_len False dow False tod False grade_level False shares False dtype: bool
from sklearn.preprocessing import StandardScaler
# Normalize the data
scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)
# Sample 80% of the data for training; keep 20% for testing
train_prop = int(.8*len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]
features = training_set[feature_list]
target = training_set.shares
from sklearn import linear_model
# Fit a linear classifier to the training data using stochastic gradient descent
# (probably not a great idea; features are likely to be highly correlated)
clf = linear_model.SGDRegressor()
clf.fit(features, target)
SGDRegressor(alpha=0.0001, epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling', loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25, random_state=None, shuffle=False, verbose=0, warm_start=False)
# Predict results of testing set to measure accuracy
predicted_shares = clf.predict(testing_set[feature_list])
from sklearn.metrics import r2_score
# Measure the accuracy of the predictions
r2_score(testing_set.shares, predicted_shares)
-0.0032258761644354816
# Bin the number of shares into buckets
bins = [0, 1e3, 1e4, 1e5, blog_data.shares.max()]
blog_data['binned_shares'] = pd.cut(blog_data.shares, bins=bins, labels=bins[1:])
# Check the distribution of binned shares
blog_data.groupby('binned_shares').size()
binned_shares 1000 11532 10000 10087 100000 3150 3349344 419 dtype: int64
# As before, prepare the data...
reduced_blog_data = blog_data[feature_list] # + ['binned_shares']
scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)
norm_blog_data.isnull().any()
train_prop = int(.8*len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]
features = training_set[feature_list]
target = blog_data[:train_prop].binned_shares.astype(str)
true_test_shares = blog_data[train_prop:].binned_shares.astype(str)
# Fit a classifier on the binned shares and predict
clf = linear_model.SGDClassifier()
clf.fit(features, target)
predicted_shares = clf.predict(testing_set)
from sklearn.metrics import accuracy_score
# Measure the accuracy of binned predictions among the 4 categories
accuracy_score(true_test_shares, predicted_shares)
0.42119888844779674
Image("http://scikit-learn.org/stable/_static/ml_map.png")